This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(lubridate) # Working with Dates
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:lubridate':
##
## stamp
## The following object is masked from 'package:dplyr':
##
## rename
library(reshape2)# Reshaping the data
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
## The following objects are masked from 'package:reshape':
##
## expand, smiths
library(tidyverse)# brings in ggplot2 and dplyr together
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ stringr 1.4.0
## ✓ tibble 3.0.4 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x reshape::rename() masks dplyr::rename()
## x lubridate::setdiff() masks base::setdiff()
## x reshape::stamp() masks lubridate::stamp()
## x lubridate::union() masks base::union()
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following object is masked from 'package:reshape':
##
## melt
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(zoo) ## datetime
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(bizdays) # business day
##
## Attaching package: 'bizdays'
## The following object is masked from 'package:stats':
##
## offset
library(ggeasy) # for easy ggplot editing
library(harrypotter) # for palettes
library(bizdays)
library(timeDate)
## Visualizations
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:reshape':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
library(Rcpp)
library(tsibble)
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:zoo':
##
## index
## The following object is masked from 'package:data.table':
##
## key
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
dat_train = data.frame(read.csv("forecast_traindata.csv"))
dat_test = data.frame(read.csv("forecast_testdata.csv"))
dat_train$application_date<-as.Date(dat_train$application_date,format = "%d-%m-%Y")
dat_test$application_date<-as.Date(dat_test$application_date,format = "%d-%m-%Y")
str(dat_train)
## 'data.frame': 79922 obs. of 6 variables:
## $ application_date: Date, format: "2017-04-01" "2017-04-01" ...
## $ segment : int 1 1 1 1 1 1 1 1 1 1 ...
## $ branch_id : int 1 3 5 7 8 9 10 11 13 14 ...
## $ state : chr "WEST BENGAL" "DELHI" "KARNATAKA" "WEST BENGAL" ...
## $ zone : chr "EAST" "NORTH" "SOUTH" "EAST" ...
## $ no_of_applicants: int 40 58 10 2 13 11 0 9 1 0 ...
summary(dat_train)
## application_date segment branch_id state
## Min. :2017-04-01 Min. :1.000 Min. : 1.0 Length:79922
## 1st Qu.:2017-10-26 1st Qu.:1.000 1st Qu.: 36.0 Class :character
## Median :2018-05-13 Median :1.000 Median : 82.0 Mode :character
## Mean :2018-05-10 Mean :1.168 Mean :118.8
## 3rd Qu.:2018-11-25 3rd Qu.:1.000 3rd Qu.:248.0
## Max. :2019-06-23 Max. :2.000 Max. :271.0
## NA's :2490 NA's :2490 NA's :15514
## zone no_of_applicants
## Length:79922 Min. : 0.0
## Class :character 1st Qu.: 0.0
## Mode :character Median : 17.0
## Mean : 184.9
## 3rd Qu.: 60.0
## Max. :13787.0
## NA's :2490
dim(dat_train)
## [1] 79922 6
glimpse(dat_train)
## Rows: 79,922
## Columns: 6
## $ application_date <date> 2017-04-01, 2017-04-01, 2017-04-01, 2017-04-01, 2017…
## $ segment <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ branch_id <int> 1, 3, 5, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19,…
## $ state <chr> "WEST BENGAL", "DELHI", "KARNATAKA", "WEST BENGAL", "…
## $ zone <chr> "EAST", "NORTH", "SOUTH", "EAST", "EAST", "EAST", "EA…
## $ no_of_applicants <int> 40, 58, 10, 2, 13, 11, 0, 9, 1, 0, 5, 1, 0, 0, 17, 8,…
dat_train$application_date<-as.Date(dat_train$application_date,format = "%d-%m-%Y")
str(dat_train)
## 'data.frame': 79922 obs. of 6 variables:
## $ application_date: Date, format: "2017-04-01" "2017-04-01" ...
## $ segment : int 1 1 1 1 1 1 1 1 1 1 ...
## $ branch_id : int 1 3 5 7 8 9 10 11 13 14 ...
## $ state : chr "WEST BENGAL" "DELHI" "KARNATAKA" "WEST BENGAL" ...
## $ zone : chr "EAST" "NORTH" "SOUTH" "EAST" ...
## $ no_of_applicants: int 40 58 10 2 13 11 0 9 1 0 ...
dat_train$Day<-format(dat_train$application_date,"%d")
dat_train$WeekOfDay = format(dat_train$application_date, format = "%a") ## Getting days of week
dat_train$Weekly = week(dat_train$application_date) #Getting the week of date
dat_train$Month<-format(dat_train$application_date,"%b")
dat_train$Year<-format(dat_train$application_date,"%Y")
dat_train$MonYr = format(dat_train$application_date, "%b-%Y") #Extracting yearly
dat_train$YearQrt = as.yearqtr(dat_train$application_date)#Extracting yearquarterly
dat_train$qrtr<-quarters(dat_train$application_date) # extracting quarter
dat_train$YearQrt = as.character(as.yearqtr(dat_train$application_date))#Extracting quarterly
dat_train$Qrt = as.character(quarters(dat_train$application_date))#Extracting quarterly
dat_train
split_data <- split(dat_train, f = dat_train$segment)
df_segment1<-dat_train %>%
filter(segment==1)
dim(df_segment1)
## [1] 64408 15
DT <- rbindlist(split_data[2])
df_segment2<-as.data.frame(DT)
dim(df_segment2)
## [1] 13024 15
df_segment2<-dplyr::select(df_segment2,-c("branch_id","zone"))
dim(df_segment2)
## [1] 13024 13
df_segment2
#segmnet1
ggplot(df_segment1,aes (x=reorder(state,no_of_applicants),
y=no_of_applicants)) + geom_bar(stat="summary",fun="sum", width=0.5, fill = "Darkred")+
labs(x="state",y = "no_of_applicants", fill="state") +
ggtitle("state wise contribution in Segment1 ")+
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))
#segment2
ggplot(df_segment2,aes (x=reorder(state ,no_of_applicants),
y=no_of_applicants)) + geom_bar(stat="summary",fun="sum", width=0.5, fill = "Darkred")+
labs(x="state",y = "no_of_applicants", fill="state") +
ggtitle("state wise contribution in Segment2 ")+
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))
p17<-df_segment2 %>%
filter(Year==2017)
aggData <- aggregate(x =p17$`no_of_applicants`,
by=list(state_wise = p17$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants in segment 2 (2017)")
#2018
p18<-df_segment2 %>%
filter(Year==2018)
aggData <- aggregate(x =p18$`no_of_applicants`,
by=list(state_wise = p18$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants segment 2 (2018)")
#2019
p19<-df_segment2 %>%
filter(Year==2019)
aggData <- aggregate(x =p19$`no_of_applicants`,
by=list(state_wise = p19$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants segment 2 (2018)")
segment17<-df_segment1 %>%
filter(Year==2017)
aggData <- aggregate(x =segment17$`no_of_applicants`,
by=list(state_wise = segment17$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants segment 1 (2017)")
#2018
segment18<-df_segment1 %>%
filter(Year==2018)
aggData <- aggregate(x =segment18$`no_of_applicants`,
by=list(state_wise = segment18$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants segment 1 (2018)")
#2019
segment19<-df_segment1 %>%
filter(Year==2019)
aggData <- aggregate(x =segment19$`no_of_applicants`,
by=list(state_wise = segment19$state),
FUN = mean)
ggplot(data = aggData, aes(x = state_wise, y = prop.table(stat(aggData$x)), fill = state_wise, label = scales::percent(prop.table(stat(aggData$x))))) +
geom_bar(stat="identity", position = "dodge") +
geom_text(stat = 'identity', position = position_dodge(.9), vjust = -0.5, size = 2) +
scale_y_continuous(labels = scales::percent) +
theme_bw()+
theme(axis.text.x=element_text(angle=60,hjust=1))+
labs(x = 'state', y = 'applicants in percntage', fill = 'state') +
ggtitle("Percentage of Applicants segment 1 (2019)")
df1 <- as_tsibble( df_segment1 ,
index=application_date,
key=c(segment, branch_id, state, zone))
monthly_view <- df1 %>%
group_by_key() %>%
index_by(Year_Month = ~ yearmonth(.)) %>% # monthly aggregates
summarise(
Total_Applications = sum(no_of_applicants, na.rm = TRUE)
)
monthly_view1 <- monthly_view %>%
group_by(zone) %>%
index_by(Year_Month) %>% # monthly aggregates
summarise(
Total_Applications = sum(Total_Applications, na.rm = TRUE)
)
date_range = as.Date(c('01-03-2017','06-07-2019'), "%d-%m-%Y")
fig <- plot_ly(monthly_view1, x=~Year_Month,
y=~Total_Applications, color=~zone,
type='scatter',
mode='lines') %>%
layout(title = 'Monthly Applications v/s Zone',
xaxis = list(title = 'Months', range=date_range,
ticktext=(as.character(monthly_view1[1:27,]$Year_Month)),
tickvals=(monthly_view1[1:27,]$Year_Month),
tickmode='array'),
yaxis = list(title = 'Applicants'))
fig
mdl_dt<-dat_train[,c("segment","application_date","no_of_applicants")]
cs_trend<-mdl_dt%>%group_by(segment,application_date)%>%summarise(No_cases = sum(no_of_applicants),.groups='drop')
ggplot(cs_trend,aes(x = application_date,y = No_cases,color = segment))+geom_line(stat = "identity")+labs(title = "Trend of Applications by Segment")+scale_x_date(date_labels = "%b-%Y")+facet_grid(segment~.,scale = "free")
## Warning: Removed 1 row(s) containing missing values (geom_path).
library(ggplot2)
#segment1
level_orderM<-c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
month_summary <-df_segment1 %>%
group_by(Month) %>%
summarise(no_of_applicants= sum(no_of_applicants),.groups='drop')
month_summary %>%
ggplot(aes(x =factor(Month,level=level_orderM), y = no_of_applicants, fill = Month)) +
geom_col() +
scale_fill_hp_d(option = "LunaLovegood") +
scale_y_continuous(limits = c(0, 300000), expand = c(0,0)) +
labs(title = "Total number of applicants in each month in segment1", x = "Monthly", y = "Total applicants")
#segment2
level_orderM<-c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
month_summary <-df_segment2 %>%
group_by(Month) %>%
summarise(no_of_applicants= sum(no_of_applicants),.groups='drop')
month_summary %>%
ggplot(aes(x =factor(Month,level=level_orderM), y = no_of_applicants, fill = Month)) +
geom_col() +
scale_fill_hp_d(option = "Slytherin") +
scale_y_continuous(limits = c(0, 1231731), expand = c(0,0)) +
labs(title = "Total number of applicants in each month in segment2", x = "Monthly", y = "Total applicants")
level_orderD<-c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")
wday_summary <- df_segment1 %>%
group_by(YearQrt) %>%
summarise(no_of_applicants = sum(no_of_applicants),.groups='drop')
wday_summary %>%
ggplot(aes(x =YearQrt ,y = no_of_applicants, fill = YearQrt)) +
geom_col() +
scale_fill_hp_d(option = "Ravenclaw") +
scale_y_continuous(limits = c(0, 400000), expand = c(0,0)) +
labs(title = "Total segment 1 applicants by quarter ",x = "Quarterly")
# segment2
level_orderD<-c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")
wday_summary <- df_segment2 %>%
group_by(YearQrt) %>%
summarise(no_of_applicants = sum(no_of_applicants),.groups='drop')
wday_summary %>%
ggplot(aes(x =YearQrt ,y = no_of_applicants, fill = YearQrt)) +
geom_col() +
scale_fill_hp_d(option = "DracoMalfoy") +
scale_y_continuous(limits = c(0, 2000000), expand = c(0,0)) +
labs(title = "Total applicants in segment 2 by Quarter ",x = "Quarterly")
level_orderD<-c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")
wday_summary <- df_segment1 %>%
group_by(WeekOfDay) %>%
summarise(no_of_applicants = mean(no_of_applicants),.groups='drop')
wday_summary %>%
ggplot(aes(x =factor(WeekOfDay,level=level_orderD) ,y = no_of_applicants, fill = WeekOfDay)) +
geom_col() +
scale_fill_hp_d(option = "HermioneGranger") +
scale_y_continuous(limits = c(0, 40), expand = c(0,0)) +
labs(title = "Avg applicants by the day of the week",x = "Days of Week")
level_orderD<-c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")
level_orderM<-c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
wday_month_summary <- df_segment1 %>%
group_by(WeekOfDay, Month) %>%
summarise(mean_applicants = mean(no_of_applicants),.groups='drop')
wday_month_summary %>%
ggplot(aes(x = factor(WeekOfDay,level=level_orderD), y = mean_applicants, fill = WeekOfDay)) +
geom_col(width =1)+
scale_fill_hp_d(option = "DracoMalfoy") +
facet_wrap(~Month,dir="v",ncol=3,nrow= 4,as.table = TRUE) +
scale_y_continuous(limits = c(0, 80), expand = c(0,0)) +
labs(title = "Avg applicants though the week in every month",
x = "Week Of Day", y = "total applicants" )
#segment1
level_orderY<-c("2017","2018","2019","2020")
level_orderM<-c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
wday_month_summary <- df_segment1 %>%
group_by(Year, Month) %>%
summarise(mean_applicants = mean(no_of_applicants),.groups='drop')
wday_month_summary %>%
ggplot(aes(x = factor(Month,level=level_orderM), y = mean_applicants, fill = Month)) +
geom_col(width =1)+
scale_fill_hp_d(option = "Ravenclaw") +
facet_wrap(~Year,dir="h",ncol=1 ,as.table = TRUE) +
scale_y_continuous(limits = c(0, 70), expand = c(0,0)) +
labs(title = "Avg applicants each month across the Year in Segment1",
x = "Month Of Year ", y = "total applicants" )
#segment2
wday_month_summary <- df_segment2 %>%
group_by(Year, Month) %>%
summarise(mean_applicants = mean(no_of_applicants),.groups='drop')
wday_month_summary %>%
ggplot(aes(x = factor(Month,level=level_orderM), y = mean_applicants, fill = Month)) +
geom_col(width =1)+
scale_fill_hp_d(option = "Hufflepuff") +
facet_wrap(~Year,dir="h",ncol=1 ,as.table = TRUE) +
scale_y_continuous(limits = c(0, 1500), expand = c(0,0)) +
labs(title = "Avg applicants each month across the Year in Segment2",
x = "Month Of Year ", y = "total applicants" )